Automate sentiment analysis of textual comments and feedback¶
In [1]:
import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
# Load the preprocessed Kindle review dataset.
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
# The value is kept identical but surfaced as a constant so it is easy to
# relocate (ideally a Path under a configurable DATA_DIR).
DATA_PATH = r"d:\Users\Mayank\Desktop\RIO-125\RIO-125-1\preprocessed_kindle_review .csv"
data = pd.read_csv(DATA_PATH)
In [3]:
# NOTE(review): `df = data` creates an alias, not a copy — later in-place
# changes to `data` are reflected here too. Use data.copy() if isolation is needed.
df=data
df
Out[3]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 5 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 5 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 5 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 5 | This isn't the kind of book I normally read, a... | Great Story! |
| ... | ... | ... | ... | ... |
| 11995 | 11995 | 2 | Had to read certain passages twice--typos. Wi... | Where's the meat? |
| 11996 | 11996 | 3 | Not what i expected. yet a very interesting bo... | Interesting |
| 11997 | 11997 | 5 | Dragon Knights is a world where Knights ride d... | Dragon Knights, Wings of Change (I Dream of Dr... |
| 11998 | 11998 | 4 | Since this story is very short, it's hard to s... | Good writing, short story |
| 11999 | 11999 | 4 | from 1922 an amazing collection of info on sym... | interesting public domain book |
12000 rows × 4 columns
In [4]:
# printing dim of the data as (rows, columns)
data.shape
Out[4]:
(12000, 4)
In [5]:
# displaying all the columns of the dataset
# ('Unnamed: 0' is a leftover CSV index column; it is dropped later)
data.columns
Out[5]:
Index(['Unnamed: 0', 'rating', 'reviewText', 'summary'], dtype='object')
In [6]:
# quick review of the dataset (first 5 rows)
data.head()
Out[6]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 5 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 5 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 5 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 5 | This isn't the kind of book I normally read, a... | Great Story! |
In [7]:
# printing the first review from the dataset (raw text, before any cleaning)
data.reviewText[0]
Out[7]:
'This book was the very first bookmobile book I bought when I was in the school book club. I loved the story then and I bet a dollar to a donut I will love it again. If my memory serves, I bought this book in 5th grade. That would have been about 1961. I am looking forward to reliving the memories.'
In [8]:
# value_counts() returns counts of unique rating values in descending order,
# so the first element is the most frequent rating.
# `a` is reused by the Plotly bar-chart cell below.
a=data.rating.value_counts()
a
Out[8]:
rating 5 3000 4 3000 1 2000 3 2000 2 2000 Name: count, dtype: int64
In [9]:
# checking for null values per column
# NOTE(review): 'summary' shows 2 nulls in the output and is never cleaned —
# harmless here because only 'reviewText' and 'rating' are used downstream.
data.isnull().sum()
Out[9]:
Unnamed: 0 0 rating 0 reviewText 0 summary 2 dtype: int64
In [10]:
# Plotting the rating histogram (ratings still on the original 1-5 scale here)
data.rating.hist()
plt.title("Distribution of rating using Matplotlib")
plt.show()
In [11]:
# Plotting the rating distribution as an interactive Plotly bar chart.
# `a` holds the rating value_counts from the earlier cell.
# FIX: corrected the "ploty" typo in the chart title.
fig = go.Figure([go.Bar(x=a.index, y=a.values, text=a.values)])
fig.update_layout(title='Distribution of the Rating using Plotly')
fig.show()
Preprocessing Data¶
In [12]:
# Listing the column names before dropping the ones that are not needed
# (the actual drop happens in the next cell).
data.columns
Out[12]:
Index(['Unnamed: 0', 'rating', 'reviewText', 'summary'], dtype='object')
In [13]:
# Drop the leftover index column and the summary; keep rating + reviewText.
# NOTE(review): `df` is not used downstream — later cells keep operating on `data`.
df = data.drop(['Unnamed: 0', 'summary'], axis=1)
df
Out[13]:
| rating | reviewText | |
|---|---|---|
| 0 | 5 | This book was the very first bookmobile book I... |
| 1 | 1 | When I read the description for this book, I c... |
| 2 | 5 | I just had to edit this review. This book is a... |
| 3 | 5 | I don't normally buy 'mystery' novels because ... |
| 4 | 5 | This isn't the kind of book I normally read, a... |
| ... | ... | ... |
| 11995 | 2 | Had to read certain passages twice--typos. Wi... |
| 11996 | 3 | Not what i expected. yet a very interesting bo... |
| 11997 | 5 | Dragon Knights is a world where Knights ride d... |
| 11998 | 4 | Since this story is very short, it's hard to s... |
| 11999 | 4 | from 1922 an amazing collection of info on sym... |
12000 rows × 2 columns
In [14]:
# preview of the dataset (note: `data` still has all 4 columns — the drop
# above was assigned to `df`, not `data`)
data.head()
Out[14]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 5 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 5 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 5 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 5 | This isn't the kind of book I normally read, a... | Great Story! |
In [15]:
# Converting the 1-5 rating to a binary sentiment label:
# ratings below 3 -> 1 (negative), ratings 3 and above -> 0 (positive).
# NOTE(review): the original comment stated this the wrong way round; also,
# rating 3 (neutral) is folded into the positive class.
data["rating"] = data["rating"].apply(lambda x: 1 if x < 3 else 0) # positive as 0 and negative as 1
data
Out[15]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | This book was the very first bookmobile book I... | 50 + years ago... |
| 1 | 1 | 1 | When I read the description for this book, I c... | Boring! Boring! Boring! |
| 2 | 2 | 0 | I just had to edit this review. This book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | I don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 0 | This isn't the kind of book I normally read, a... | Great Story! |
| ... | ... | ... | ... | ... |
| 11995 | 11995 | 1 | Had to read certain passages twice--typos. Wi... | Where's the meat? |
| 11996 | 11996 | 0 | Not what i expected. yet a very interesting bo... | Interesting |
| 11997 | 11997 | 0 | Dragon Knights is a world where Knights ride d... | Dragon Knights, Wings of Change (I Dream of Dr... |
| 11998 | 11998 | 0 | Since this story is very short, it's hard to s... | Good writing, short story |
| 11999 | 11999 | 0 | from 1922 an amazing collection of info on sym... | interesting public domain book |
12000 rows × 4 columns
In [16]:
# Lowercasing all review text so later token matching is case-insensitive.
data["reviewText"] = data["reviewText"].str.lower()
data.head()
Out[16]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | this book was the very first bookmobile book i... | 50 + years ago... |
| 1 | 1 | 1 | when i read the description for this book, i c... | Boring! Boring! Boring! |
| 2 | 2 | 0 | i just had to edit this review. this book is a... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | i don't normally buy 'mystery' novels because ... | Very good read. |
| 4 | 4 | 0 | this isn't the kind of book i normally read, a... | Great Story! |
In [17]:
# Strip all ASCII punctuation characters from a review.
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    """Return `text` with every character in string.punctuation removed."""
    return "".join(ch for ch in text if ch not in PUNCT_TO_REMOVE)
# Apply punctuation removal to every review. Safe without null handling:
# the isnull() check above showed 0 nulls in reviewText.
data["reviewText"] = data["reviewText"].apply(lambda text: remove_punctuation(text))
data.head()
Out[17]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | this book was the very first bookmobile book i... | 50 + years ago... |
| 1 | 1 | 1 | when i read the description for this book i co... | Boring! Boring! Boring! |
| 2 | 2 | 0 | i just had to edit this review this book is an... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | i dont normally buy mystery novels because i j... | Very good read. |
| 4 | 4 | 0 | this isnt the kind of book i normally read alt... | Great Story! |
In [18]:
# Drop English stopwords from every review.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with NLTK English stopwords filtered out."""
    kept = [token for token in str(text).split() if token not in STOPWORDS]
    return " ".join(kept)

data["reviewText"] = data["reviewText"].apply(remove_stopwords)
data.head()
Out[18]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | book first bookmobile book bought school book ... | 50 + years ago... |
| 1 | 1 | 1 | read description book couldnt wait read downlo... | Boring! Boring! Boring! |
| 2 | 2 | 0 | edit review book believe got right updated rew... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | dont normally buy mystery novels dont like how... | Very good read. |
| 4 | 4 | 0 | isnt kind book normally read although try limi... | Great Story! |
In [19]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
pos_tagged_text = nltk.pos_tag(text.split())
return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
data["reviewText"] = data["reviewText"].apply(lambda text: lemmatize_words(text))
data.head()
Out[19]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | book first bookmobile book buy school book clu... | 50 + years ago... |
| 1 | 1 | 1 | read description book couldnt wait read downlo... | Boring! Boring! Boring! |
| 2 | 2 | 0 | edit review book believe get right update rewr... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | dont normally buy mystery novels dont like how... | Very good read. |
| 4 | 4 | 0 | isnt kind book normally read although try limi... | Great Story! |
Visualization on Dataset¶
In [20]:
# Plotting the rating histogram again, now on the binary labels
# (0 = positive, 1 = negative).
# NOTE(review): duplicates the earlier histogram cell — consider a shared helper.
data.rating.hist()
plt.title("Distribution of rating using Matplotlib")
plt.show()
In [21]:
# Plotting the binary rating distribution as a Plotly bar chart
# (value_counts recomputed after the 0/1 relabelling).
# FIX: corrected the "ploty" typo in the chart title.
a = data.rating.value_counts()
fig = go.Figure([go.Bar(x=a.index, y=a.values, text=a.values)])
fig.update_layout(title='Distribution of the Rating using Plotly')
fig.show()
In [22]:
# Plotting a word cloud over the cleaned review corpus.
# BUG FIX: the original joined only the FIRST word of every review
# (`cat.split()[0]`), so the cloud reflected first words only — and would
# raise IndexError on any review left empty by preprocessing.
# Join the full text of every review instead.
text = " ".join(data.reviewText)
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)
In [23]:
# Render the word cloud; axis ticks are meaningless for an image, so hide them.
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [24]:
from collections import Counter

# Corpus-wide word frequencies across all cleaned reviews.
cnt = Counter()
for review in data["reviewText"].values:
    cnt.update(review.split())
cnt.most_common(10)
Out[24]:
[('book', 15397),
('story', 11027),
('read', 10023),
('like', 6207),
('one', 5949),
('character', 5677),
('get', 5453),
('love', 5059),
('good', 4763),
('would', 4093)]
In [25]:
# Frequency of a single word (matches the top entry of most_common above).
cnt.get("book")
Out[25]:
15397
In [26]:
# Top-10 (word, count) pairs; `w` and `c` collect words and counts
# separately for the pie chart below.
cn = cnt.most_common(10)
w = []
c = []
In [27]:
# Split the (word, count) pairs into the parallel lists declared above.
for word, count in cn:
    w.append(word)
    c.append(count)
In [28]:
#Pie chart for Most Frequent Words
# NOTE(review): the `data` frame argument is unused here — values/names are
# supplied directly as lists.
fig = px.pie(data, values=c, names=w, color_discrete_sequence=px.colors.sequential.RdBu)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Most Frequent Words")
fig.show()
In [29]:
# Display the 11 least-common words in the corpus (potential noise terms).
n_rare_words = 11
RAREWORDS = {word for word, _ in cnt.most_common()[-n_rare_words:]}
RAREWORDS
Out[29]:
{'1922',
'backgroung',
'don8216t',
'firedrake',
'gryphon',
'helos',
'insite',
'meaness',
'relm',
'symbols',
'twicetypos'}
Splitting the data¶
In [30]:
# preview of data after all text preprocessing steps
data
Out[30]:
| Unnamed: 0 | rating | reviewText | summary | |
|---|---|---|---|---|
| 0 | 0 | 0 | book first bookmobile book buy school book clu... | 50 + years ago... |
| 1 | 1 | 1 | read description book couldnt wait read downlo... | Boring! Boring! Boring! |
| 2 | 2 | 0 | edit review book believe get right update rewr... | Wiggleliscious/new toy ready/!! |
| 3 | 3 | 0 | dont normally buy mystery novels dont like how... | Very good read. |
| 4 | 4 | 0 | isnt kind book normally read although try limi... | Great Story! |
| ... | ... | ... | ... | ... |
| 11995 | 11995 | 1 | read certain passage twicetypos wish build rel... | Where's the meat? |
| 11996 | 11996 | 0 | expect yet interesting book usually don8216t r... | Interesting |
| 11997 | 11997 | 0 | dragon knight world knight ride dragon slay wi... | Dragon Knights, Wings of Change (I Dream of Dr... |
| 11998 | 11998 | 0 | since story short hard say much without give a... | Good writing, short story |
| 11999 | 11999 | 0 | 1922 amazing collection info symbols culture a... | interesting public domain book |
12000 rows × 4 columns
In [31]:
# 70/30 split, stratified on the binary rating so both classes keep their
# proportions; fixed random_state for reproducibility.
train, test = train_test_split(data, test_size = 0.3, stratify = data['rating'], random_state = 42)
In [32]:
# Bag-of-words features: binary presence/absence, dropping terms that appear
# in fewer than 10 reviews (min_df) or in more than 95% of reviews (max_df).
# FIX: the original called fit_transform, discarded the result, then
# re-transformed the training set — fit_transform already returns the
# training feature matrix.
cv = CountVectorizer(binary=True, min_df=10, max_df=0.95)
train_feature_set = cv.fit_transform(train['reviewText'].values)
test_feature_set = cv.transform(test['reviewText'].values)
In [33]:
# Sparse document-term matrix: train docs x vocabulary size (8400 x 3773 per output).
train_feature_set
Out[33]:
<8400x3773 sparse matrix of type '<class 'numpy.int64'>' with 322980 stored elements in Compressed Sparse Row format>
In [34]:
# Vocabulary size (number of feature columns).
train_feature_set.shape[1]
Out[34]:
3773
In [35]:
# Column index assigned to the token 'book' in the vectorizer vocabulary.
cv.vocabulary_['book']
Out[35]:
422
In [36]:
# Extract the binary label arrays for the classifier.
y_train = train['rating'].to_numpy()
y_test = test['rating'].to_numpy()
Building our Model¶
In [37]:
# Train logistic regression on the binary bag-of-words features and report
# accuracy and F1 on the held-out test set (max_iter raised to ensure convergence).
lr = LogisticRegression(random_state = 42, max_iter=1000)
lr.fit(train_feature_set,y_train)
y_pred = lr.predict(test_feature_set)
print("Accuracy: ",round(metrics.accuracy_score(y_test,y_pred),3))
print("F1: ",round(metrics.f1_score(y_test, y_pred),3))
Accuracy: 0.823 F1: 0.725
In [38]:
# Raw confusion matrix: rows = true class, columns = predicted class.
cm1 = confusion_matrix(y_test, y_pred)
cm1
Out[38]:
array([[2123, 277],
[ 360, 840]], dtype=int64)
In [39]:
# Confusion matrix normalised over true rows (per-class recall on the diagonal).
cm2 = confusion_matrix(y_test, y_pred,normalize='true')
cm2
Out[39]:
array([[0.88458333, 0.11541667],
[0.3 , 0.7 ]])
In [40]:
# Plot the raw-count confusion matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=cm1,display_labels=lr.classes_)
disp.plot()
plt.show()
In [41]:
# Plot the row-normalised confusion matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=cm2,display_labels=lr.classes_)
disp.plot()
plt.show()
In [42]:
# Inspect the coefficients of the first 10 vocabulary features.
# FIX: the original did an O(V) reverse lookup through cv.vocabulary_ for
# every feature (quadratic overall); get_feature_names_out() provides the
# index -> term mapping directly.
feature_names = cv.get_feature_names_out()
feature_importance = lr.coef_[0][:10]
for i,v in enumerate(feature_importance):
    print('Feature: ', feature_names[i], 'Score: ', v)
Feature: 099 Score: 0.18776820123670163 Feature: 10 Score: 0.31980915609027416 Feature: 100 Score: 0.4884943889179449 Feature: 11 Score: -0.3913364970987692 Feature: 12 Score: -0.46257777467167416 Feature: 13 Score: 0.8420683409084077 Feature: 14 Score: -0.6040472434596486 Feature: 15 Score: 0.9190523185535769 Feature: 16 Score: 0.45578601105113226 Feature: 17 Score: 0.43887300859858464
In [43]:
# Sort coefficient indices ascending: most negative first (strongest
# positive-sentiment words), most positive last (strongest negative-sentiment words).
feature_importance = lr.coef_[0]
sorted_idx = np.argsort(feature_importance)
In [44]:
# Top 10 words for the positive class (i.e. negative sentiment, label 1):
# the 10 largest positive coefficients, in descending order.
# FIX: replaced the quadratic vocabulary_ reverse lookup with
# get_feature_names_out(), which maps column index -> term directly.
feature_names = cv.get_feature_names_out()
top_10_pos_w = [feature_names[w] for w in sorted_idx[-1:-11:-1]]
print(top_10_pos_w)
['waste', 'delete', 'cardboard', 'sorry', 'depress', 'boring', 'thin', '25', 'ugh', 'weird']
In [45]:
# Bar chart of the 10 largest positive coefficients (strongest negative-sentiment words).
fig = plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_10_pos_w, y=feature_importance[sorted_idx[range(-1,-11, -1)]])
plt.title("Most Important Words Used for Negative Sentiment",fontsize = 20)
x_locs,x_labels = plt.xticks()
plt.setp(x_labels, rotation = 40)
plt.ylabel('Feature Importance', fontsize = 12)
plt.xlabel('Word', fontsize = 12);
In [46]:
# Top 10 words for the negative class (i.e. positive sentiment, label 0):
# the 10 most negative coefficients, in ascending order.
# FIX: replaced the quadratic vocabulary_ reverse lookup with
# get_feature_names_out(), which maps column index -> term directly.
feature_names = cv.get_feature_names_out()
top_10_neg_w = [feature_names[w] for w in sorted_idx[:10]]
print(top_10_neg_w)
['enjoyed', 'verne', 'enjoyable', 'shot', 'fun', 'hunter', 'thumb', 'thrill', 'loved', 'hot']
In [47]:
# Bar chart of the 10 most negative coefficients (strongest positive-sentiment words).
fig = plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_10_neg_w, y=feature_importance[sorted_idx[:10]])
plt.title("Most Important Words Used for Positive Sentiment",fontsize = 20)
x_locs,x_labels = plt.xticks()
plt.setp(x_labels, rotation = 40)
plt.ylabel('Feature Importance', fontsize = 12)
plt.xlabel('Word', fontsize = 12);
Prediction on our sentences¶
In [48]:
# Class order used by predict_proba columns: index 0 -> class 0 (positive),
# index 1 -> class 1 (negative), per the rating binarisation above.
# (The original inline comment had this reversed.)
lr.classes_# class 0 (positive) first, class 1 (negative) second
Out[48]:
array([0, 1], dtype=int64)
In [49]:
# Sanity-check the model on a hand-written negative sentence.
# NOTE(review): this raw sentence skips the training-time preprocessing
# (lowercasing, punctuation/stopword removal, lemmatization) — confirm that
# is intended, since tokens may not match the training vocabulary.
test_review = cv.transform(["I did not enjoy the book"])
probabilities = lr.predict_proba(test_review)
predicted_class = lr.predict(test_review)
print("prob are:",probabilities)
print("prediction are:",predicted_class)
prob are: [[0.78883762 0.21116238]] prediction are: [0]
In [50]:
# Decision-threshold sweep: report F1 when predicting class 1 (negative)
# whenever its predicted probability exceeds the threshold.
# FIX: DataFrame.applymap is deprecated (see the FutureWarnings in the
# original output); a vectorized comparison on the positive-class column is
# equivalent and only computes the column that is actually used.
pred_proba_df = pd.DataFrame(lr.predict_proba(test_feature_set))
threshold_list = [0.3,0.4,0.45,0.5]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    # Column 1 holds P(class 1); threshold it to get the 0/1 prediction.
    y_pred_at_threshold = (pred_proba_df.loc[:, 1] > i).astype(int).values
    test_f1 = round(metrics.f1_score(y_test, y_pred_at_threshold),3)
    print('F1: {}'.format(test_f1))
******** For i = 0.3 ****** F1: 0.737 ******** For i = 0.4 ****** F1: 0.732 ******** For i = 0.45 ****** F1: 0.732 ******** For i = 0.5 ****** F1: 0.725
C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead. C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead. C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead. C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
In [ ]: